{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 二元分类效果的评估方法\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.metrics import roc_curve, auc\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 术语解释：\n",
    "# 真阳性：true positives, TP True => True\n",
    "# 真阴性：true negatives, TN False => False\n",
    "# 假阳性：false positives, FP False => True\n",
    "# 假阴性：false negatives, FN True => False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 准确率：分类器预测正确性的比例，可以通过LogisticRegression.score() 来计算准确率。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 精确率：预测为True的结果中正例的比例，P = TP / (TP + FP)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 召回率（灵敏度）：所有正例True被正确找出来的比例，R = TP / (TP + FN)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 综合评价指标：精确率和召回率都不能从差的分类器中区分出好的分类器，综合评价指标平衡了精确率和召回率。\n",
    "# F=2 * PR / (P + R)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# ROC(Receiver Operating Characteristic)画的是分类器的召回率与误警率(fall-out)的曲线\n",
    "# AUC(Area Under Curve)，ROC曲线下方的面积,它把ROC曲线变成一个值,表示分类器随机预测的效果。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 0.    0.    0.25  0.25  1.  ]\n",
      "[ 0.5         0.83333333  0.83333333  1.          1.        ]\n",
      "[ 0.9   0.75  0.3   0.2   0.1 ]\n",
      "0.958333333333\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[<matplotlib.lines.Line2D at 0x19a8c0f9780>]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAADmNJREFUeJzt3W2IXOd5h/HrH6luKOSNaFOCpFgO\nyCUiBDksJiXQOLgusj9ItAlBCiZNcS2SVik0bcEhxQ0qpbS0jQmoTbfFpAnIjusPybaoGOJYpBg7\n1RrLTiSjsFVevJWpN2niL6F11N79MON2GK00Z3dnZ7SPrh8szJl5NHsf7erS0ZlZnVQVkqS2vGra\nA0iSxs+4S1KDjLskNci4S1KDjLskNci4S1KDjLskNci4S1KDjLskNWjrtD7xtm3bateuXdP69JK0\nKT311FPfr6qZUeumFvddu3axsLAwrU8vSZtSku92WedpGUlqkHGXpAYZd0lqkHGXpAYZd0lq0Mi4\nJ7k/yYtJvnmZx5PkM0kWkzyb5J3jH1OStBpdjtw/B+y7wuO3A7v7H4eBv1r/WJKk9Rj5Pveq+lqS\nXVdYcgD4fPWu1/dkktcneXNVvTCmGbUOc3Nw/Pi0p5A0aO9euO++jf0c4zjnvh14fmB7qX/fJZIc\nTrKQZGF5eXkMn1qjHD8Op09PewpJkzaOn1DNCveteNXtqpoD5gBmZ2e9MveE7N0LJ09OewpJkzSO\nI/clYOfA9g7gwhieV5K0RuOI+zzwof67Zt4FvOT5dkmarpGnZZI8ANwCbEuyBPwB8FMAVfVZ4ARw\nB7AI/Bj4tY0aVpLUTZd3yxwa8XgBvzm2iSRJ6+ZPqEpSg4y7JDXIuEtSg4y7JDXIuEtSg4y7JDXI\nuEtSg4y7JDXIuEtSg4y7JDXIuEtSg4y7JDXIuEtSg4y7JDXIuEtSg4y7JDXIuEtSg4y7JDXIuEtS\ng4y7JDXIuEtSg4y7JDXIuEtSg4y7JDXIuEtSg4y7JDXIuEtSg4y7JDXIuEtSg4y7JDWoU9yT7Ety\nLslikntWePz6JI8meTbJySQ7xj+qJKmrkXFPsgU4BtwO7AEOJdkztOzPgM9X1TuAo8Afj3tQSVJ3\nXY7cbwYWq+p8Vb0MPAgcGFqzB3i0f/uxFR6XJE1Ql7hvB54f2F7q3zfoGeB9/du/DLwmyRvXP54k\naS26xD0r3FdD278LvCfJ08B7gH8DLl7yRMnhJAtJFpaXl1c9rCSpmy5xXwJ2DmzvAC4MLqiqC1X1\nK1V1E/DJ/n0vDT9RVc1V1WxVzc7MzKxjbEnSlXSJ+ylgd5IbklwHHATmBxck2Zbklef6BHD/eMeU\nJK3GyLhX1UXgCPAI8BzwUFWdSXI0yf7+sluAc0m+Bfws8EcbNK8kqYOtXRZV1QngxNB99w7cfhh4\neLyjSZLWyp9QlaQGGXdJapBxl6QGGXdJapBxl6QGGXdJapBxl6QGGXdJapBxl6QGGXdJapBxl6QG\nGXdJapBxl6QGGXdJapBxl6QGGXdJapBxl6QGGXdJapBxl6QGGXdJalCnC2RfTebm4PjxaU+xeZw+\nDXv3TnsKSZO26Y7cjx/vBUvd7N0LH/zgtKeQNGmb7sgdesE6eXLaU0jS1WvTHblLkkYz7pLUIOMu\nSQ0y7pLUIOMuSQ0y7pLUIOMuSQ0y7pLUoE5xT7Ivybkki0nuWeHxtyR5LMnTSZ5Ncsf4R5UkdTUy\n7km2AMeA24E9wKEke4aW/T7wUFXdBBwE/nLcg0qSuuty5H4zsFhV56vqZeBB4MDQmgJe27/9OuDC\n+EaUJK1Wl7hvB54f2F7q3zfoU8CdSZaAE8DHVnqiJIeTLCRZWF5eXsO4kqQuusQ9K9xXQ9uHgM9V\n1Q7gDuALSS557qqaq6rZqpqdmZlZ/bSSpE66xH0J2DmwvYNLT7vcBTwEUFVPAK8Gto1jQEnS6nWJ\n+ylgd5IbklxH7wXT+aE13wNuBUjyNnpx97yLJE3JyLhX1UXgCPAI8By9d8WcSXI0yf7+st8B7k7y\nDPAA8OGqGj51I0makE4X66iqE/ReKB28796B22eBd493NEnSWvkTqpLUIOMuSQ0y7pLUIOMuSQ0y\n7pLUIOMuSQ0y7pLUIOMuSQ0y7pLUIOMuSQ0y7pLUIOMuSQ0y7pLUIOMuSQ0y7pLUIOMuSQ0y7pLU\nIOMuSQ0y7pLUIOMuSQ0y7pLUIOMuSQ0y7pLUIOMuSQ0y7pLUIOMuSQ0y7pLUIOMuSQ0y7pLUIOMu\nSQ3qFPck+5KcS7KY5J4VHv90ktP9j28l+dH4R5UkdbV11IIkW4BjwG3AEnAqyXxVnX1lTVX99sD6\njwE3bcCskqSOuhy53wwsVtX5qnoZeBA4cIX1h4AHxjGcJGltusR9O/D8wPZS/75LJLkeuAH46vpH\nkyStVZe4Z4X76jJrDwIPV9V/r/hEyeEkC0kWlpeXu84oSVqlLnFfAnYObO8ALlxm7UGucEqmquaq\naraqZmdmZrpPKUlalS5xPwXsTnJDkuvoBXx+eFGSnwPeADwx3hElSas1Mu5VdRE4AjwCPAc8VFVn\nkhxNsn9g6SHgwaq63CkbSdKEjHwrJEBVnQBODN1379D2p8Y3liRpPfwJVUlqkHGXpAYZd0lqkHGX\npAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZ\nd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lq\nkHGXpAYZd0lqUKe4J9mX5FySxST3XGbNB5KcTXImyfHxjilJWo2toxYk2QIcA24DloBTSear6uzA\nmt3AJ4B3V9UPk7xpowaWJI3W5cj9ZmCxqs5X1cvAg8CBoTV3A8eq6ocAVfXieMeUJK1Gl7hvB54f\n2F7q3zfoRuDGJI8neTLJvpWeKMnhJAtJFpaXl9c2sSRppC5xzwr31dD2VmA3cAtwCPjbJK+/5BdV\nzVXVbFXNzszMrHZWSVJHXeK+BOwc2N4BXFhhzZer6idV9W3gHL3YS5KmoEvcTwG7k9yQ5DrgIDA/\ntOZLwHsBkmyjd5rm/DgHlSR1NzLuVXUROAI8AjwHPFRVZ5IcTbK/v+wR4AdJzgKPAb9XVT/YqKEl\nSVc28q2QAFV1AjgxdN+9A7cL+Hj/Q5I0Zf6EqiQ1yLhLUoOMuyQ1yLhLUoOMuyQ1yLhLUoOMuyQ1\nyLhLUoOMuyQ1yLhLUoOMuyQ1yLhLUoOMuyQ1yLhLUoOMuyQ1yLhLUoOMuyQ1yLhLUoOMuyQ1yLhL\nUoOMuyQ1yLhLUoOMuyQ1yLhLUoOMuyQ1yLhLUoOMuyQ1yLhLUoOMuyQ1yLhLUoM6xT3JviTnkiwm\nuWeFxz+cZDnJ6f7Hr49/VElSV1tHLUiyBTgG3AYsAaeSzFfV2aGlX6yqIxswoyRplbocud8MLFbV\n+ap6GXgQOLCxY0mS1qNL3LcDzw9sL/XvG/a+JM8meTjJzrFMJ0laky5xzwr31dD2PwC7quodwFeA\nv1vxiZLDSRaSLCwvL69u0r69e3sfkqTLG3nOnd6R+uCR+A7gwuCCqvrBwObfAH+y0hNV1RwwBzA7\nOzv8F0Qn9923ll8lSdeWLkfup4DdSW5Ich1wEJgfXJDkzQOb+4HnxjeiJGm1Rh65V9XFJEeAR4At\nwP1VdSbJUWChquaB30qyH7gI/Afw4Q2cWZI0QqrWdHZk3WZnZ2thYWEqn1uSNqskT1XV7Kh1/oSq\nJDXIuEtSg4y7JDXIuEtSg4y7JDVoau+WSbIMfHeNv3wb8P0xjrMZuM/XBvf52rCefb6+qmZGLZpa\n3NcjyUKXtwK1xH2+NrjP14ZJ7LOnZSSpQcZdkhq0WeM+N+0BpsB9vja4z9eGDd/nTXnOXZJ0ZZv1\nyF2SdAVXddw7XJj7p5N8sf/415PsmvyU49Vhnz+e5Gz/qlePJrl+GnOO06h9Hlj3/iSVZNO/s6LL\nPif5QP9rfSbJ8UnPOG4dvrffkuSxJE/3v7/vmMac45Lk/iQvJvnmZR5Pks/0fz+eTfLOsQ5QVVfl\nB73/XvhfgbcC1wHPAHuG1vwG8Nn+7YP0LtI99dk3eJ/fC/xM//ZHr4V97q97DfA14ElgdtpzT+Dr\nvBt4GnhDf/tN0557Avs8B3y0f3sP8J1pz73Off4F4J3ANy/z+B3AP9G72t27gK+P8/NfzUfuXS7M\nfYD/v6Tfw8CtSVa6LOBmMXKfq+qxqvpxf/NJelfG2sy6XoD9D4E/Bf5zksNtkC77fDdwrKp+CFBV\nL054xnHrss8FvLZ/+3UMXfFts6mqr9G7vsXlHAA+Xz1PAq8fuvDRulzNce9yYe7/W1NVF4GXgDdO\nZLqN0fVi5K+4i97f/JvZyH1OchOws6r+cZKDbaAuX+cbgRuTPJ7kyST7Jjbdxuiyz58C7kyyBJwA\nPjaZ0aZmtX/eV6XLNVSnpcuFubus2Uw670+SO4FZ4D0bOtHGu+I+J3kV8GnaurpXl6/zVnqnZm6h\n96+zf07y9qr60QbPtlG67PMh4HNV9edJfh74Qn+f/2fjx5uKDe3X1XzkPvLC3INrkmyl90+5K/0z\n6GrXZZ9J8ovAJ4H9VfVfE5pto4za59cAbwdOJvkOvXOT85v8RdWu39tfrqqfVNW3gXP0Yr9Zddnn\nu4CHAKrqCeDV9P4PllZ1+vO+Vldz3EdemLu//av92+8Hvlr9Vyo2qS4XI78J+Gt6Yd/s52FhxD5X\n1UtVta2qdlXVLnqvM+yvqs18jcYu39tfovfiOUm20TtNc36iU45Xl33+HnArQJK30Yv78kSnnKx5\n4EP9d828C3ipql4Y27NP+xXlEa823wF8i96r7J/s33eU3h9u6H3x/x5YBP4FeOu0Z57APn8F+Hfg\ndP9jftozb/Q+D609ySZ/t0zHr3OAvwDOAt8ADk575gns8x7gcXrvpDkN/NK0Z17n/j4AvAD8hN5R\n+l3AR4CPDHyNj/V/P74x7u9rf0JVkhp0NZ+WkSStkXGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lq\nkHGXpAb9L88e8oPhVmT9AAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x19a8a09e0b8>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "pred = [1, 1, 1, 0, 0, 0, 1, 0, 1, 1]\n",
    "predictions = [0.9, 0.9, 0.2, 0.1, 0.1, 0.1, 0.8, 0.3, 0.75, 0.9]\n",
    "false_positive_rate, recall, thresholds = roc_curve(pred, predictions)\n",
    "print(false_positive_rate)\n",
    "print(recall)\n",
    "print(thresholds) # 不懂的话看下面的解释\n",
    "roc_auc = auc(false_positive_rate, recall)\n",
    "print(roc_auc)\n",
    "plt.plot(false_positive_rate, recall, 'b', label='AUC = %0.2f' % roc_auc) # X轴是假阳性，Y轴是召回率。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 没看懂上面的很正常，下面开始具体讲解。\n",
    "# Y轴为 TPR 真正例率，预测为正且实际为正的样本占所有正例样本的比例。 TPR = TP / (TP + FN)\n",
    "# 横轴为 FPR 假正例率，预测为正但实际为负的样本占所有负例样本的比例。FPR = FP / (TN + FP)\n",
    "# 先YY一下直角四个角上的点：\n",
    "# (0, 0)：分类器将所有的样本都预测为负样本\n",
    "# (0, 1)：这意味着分类器很完美，因为它将所有的样本都正确分类。（所以曲线越接近左上角越好）\n",
    "# (1, 0)：这个分类器是最糟糕的，因为它成功避开了所有的正确答案。\n",
    "# (1, 1)：分类器将所有的样本都预测为正样本。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 如何画ROC曲线，先来20个值（按Score从小到大排列）：\n",
    "#  如果阈值设置为0或1，正好对应图上(0, 0)和(1, 1)两个点\n",
    "# ID    Class    Score    阈值0.52    阈值0.9\n",
    "# 1     F        0.1      F           F\n",
    "# 2     T        0.3      F           F\n",
    "# 3     F        0.33     F           F\n",
    "# 4     T        0.34     F           F\n",
    "# 5     F        0.35     F           F\n",
    "# 6     F        0.36     F           F\n",
    "# 7     F        0.37     F           F\n",
    "# 8     T        0.38     F           F\n",
    "# 9     F        0.39     F           F\n",
    "# 10    T        0.4      F           F\n",
    "# 11    F        0.505    F           F\n",
    "# 12    T        0.51     F           F\n",
    "# 13    F        0.52     T           F\n",
    "# 14    F        0.53     T           F\n",
    "# 15    T        0.54     T           F\n",
    "# 16    T        0.55     T           F\n",
    "# 17    T        0.6      T           F\n",
    "# 18    F        0.7      T           F\n",
    "# 19    T        0.8      T           F\n",
    "# 20    T        0.9      T           T\n",
    "# 阈值0.52：FPR(X) = 0.3, TPR(Y) = 0.5\n",
    "# 阈值0.9：FPR(X) = 0, TPR(Y) = 0.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# AUC就简单了，根据上面取不同的阈值然后计算X和Y，最后计算面积。简单说面积越大分类器性能越好。"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
